!pip install librosa
import numpy as np
import torchvision
import torchvision.transforms as transforms
import torch
from torch import nn, optim
from torch.utils import data
import random
from time import time
import random
from random import randrange
import librosa
from IPython.display import Audio
from scipy.signal import stft
from scipy.signal import istft
import scipy as sp
import math
import matplotlib.pyplot as plt
import numpy as np
from sklearn.model_selection import train_test_split
device = torch.device('cuda:0')

# Prepend 19 frames of low-amplitude noise ("silence") to the magnitude
# spectrogram so that every target frame has a full 20-frame context window.
# NOTE(review): assumes X_T (noisy magnitudes, frames x 513) and S_T (clean
# magnitudes, frames x 513) are defined in an earlier cell — confirm.
silent_pad = np.abs(np.random.normal(0, 0.001, (19, 513)))
X_T1 = np.concatenate((silent_pad, X_T))

# One (20, 513) input patch per clean frame, paired with the (513,) clean
# magnitude frame the network must predict.
X_patch = [X_T1[i:i + 20, :] for i in range(S_T.shape[0])]
Y_patch = [S_T[i, :] for i in range(S_T.shape[0])]
# Stack the Python lists of patches into contiguous arrays before tensor
# conversion: torch.tensor() on a list of ndarrays copies element-by-element,
# which is extremely slow (and emits a warning in recent PyTorch versions).
X_tensor = torch.as_tensor(np.stack(X_patch), dtype=torch.float32).to(device)
Y_tensor = torch.as_tensor(np.stack(Y_patch), dtype=torch.float32).to(device)

# Wrap the tensors in a dataset and iterate in shuffled mini-batches of 128.
train_dataset = torch.utils.data.TensorDataset(X_tensor, Y_tensor)
trainloader = torch.utils.data.DataLoader(dataset=train_dataset, batch_size=128, shuffle=True)
# Defining model architecture
model=nn.Sequential(nn.Conv2d(1,16, kernel_size=(3,3), stride=1),
nn.ReLU(),
nn.Conv2d(16,32, kernel_size=(3,3), stride=1),
nn.ReLU(),
nn.MaxPool2d(kernel_size=(2,2),stride=2),
nn.ReLU(),
nn.Conv2d(32,6, kernel_size=(3,3),stride=1),
nn.ReLU(),
nn.AvgPool2d(kernel_size=(2,2),stride=2),
nn.ReLU(),
nn.Flatten(),
nn.Linear(2268,1024),
nn.ReLU(),
nn.Linear(1024,513),
nn.ReLU(),
).to(device)
print(model)
# defining the Mean squared error for loss function
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters())
start = time()
epochs = 200
for i in range(epochs):
running_loss = 0
for x, y in trainloader:
x=x.to(device)
y=y.to(device)
optimizer.zero_grad()
output = model(x.view(-1,1,20,513))
loss = criterion(output, y)
#This is where the model learns by backpropagating
loss.backward()
#And optimizes its weights here
optimizer.step()
running_loss += loss.item()
else:
print("Epoch {} - Training loss: {}".format(i, running_loss/len(trainloader)))
print("\nTraining Time =",time()-start)
# Saving the model for future purposes (optional)
# NOTE(review): torch.save on the whole module pickles the entire object
# (class definition + weights); saving model.state_dict() is the recommended,
# more portable approach. The '.h5' extension is misleading — this writes a
# PyTorch pickle, not an HDF5 file.
torch.save(model,'denoise_problem2.h5')
# Load the model (optional)
# NOTE(review): unpickling executes arbitrary code from the file — only load
# checkpoints you created yourself.
model = torch.load('denoise_problem2.h5')
# Getting clean O/P for train noisy signal
with torch.no_grad():
out = model(X_tensor.view(-1,1,20,513))
out_numpy = out.cpu().numpy()
print("Number of output ex and features:",out_numpy.shape)
# recovered complex stft of train noisy signal
# recover clean signal using inverse STFT
X_norm = np.divide(X,X_abs)
S_pred = np.multiply(X_norm,out_numpy.T)
s_pred = librosa.istft(S_pred, hop_length=512)
print("Length of clean signal:",s_pred.shape)
Audio(s_pred,rate=sr)
# Calculating SNR for the recovered train signal
SNR = 10*math.log10(np.sum(s[:len(s_pred),]**2)/np.sum((s[:len(s_pred),]-s_pred)**2))
print(SNR)
# Load each test noisy signal and compute the STFT magnitude that will be fed
# to the network (1024-point FFT, hop 512 -> 513 frequency bins). The two
# files go through identical processing, so factor it into one helper.
def _load_test_signal(path):
    """Return (waveform, sample rate, complex STFT, |STFT|) for *path*."""
    wav, rate = librosa.load(path, sr=None)
    spec = librosa.stft(wav, n_fft=1024, hop_length=512)
    return wav, rate, spec, np.abs(spec)

x_test1, sr_test1, X_test1, X_test_abs1 = _load_test_signal('test_x_01.wav')
x_test2, sr_test2, X_test2, X_test_abs2 = _load_test_signal('test_x_02.wav')

# Transpose to (frames, 513) to match the network input layout.
X_test_T1 = X_test_abs1.T
X_test_T2 = X_test_abs2.T

# Prepend 19 low-amplitude "silent" frames so patching yields one output
# frame per original frame.
# NOTE(review): this overwrites the training-time name X_T1 — kept for
# compatibility with the patch-creation code below.
X_T1 = np.concatenate((np.abs(np.random.normal(0, 0.001, (19, 513))), X_test_T1))
X_T2 = np.concatenate((np.abs(np.random.normal(0, 0.001, (19, 513))), X_test_T2))
# Slice each padded test spectrogram into overlapping (20, 513) patches —
# one patch per original frame. Shared helper replaces the two duplicated
# loops from before.
def _make_patches(padded):
    """Return all (20, 513) windows of a padded (frames + 19, 513) array."""
    return [padded[i:i + 20, :] for i in range(padded.shape[0] - 19)]

X_test_patch1 = _make_patches(X_T1)
X_test_patch2 = _make_patches(X_T2)

# Stack into contiguous arrays first: torch.tensor() on a list of ndarrays
# copies element-by-element and is very slow (and warns in recent PyTorch).
X_test_tensor1 = torch.as_tensor(np.stack(X_test_patch1), dtype=torch.float32).to(device)
X_test_tensor2 = torch.as_tensor(np.stack(X_test_patch2), dtype=torch.float32).to(device)
# Run the network over every patch of each test signal; gradients are not
# needed at inference time.
with torch.no_grad():
    out1 = model(X_test_tensor1.view(-1, 1, 20, 513))
    out2 = model(X_test_tensor2.view(-1, 1, 20, 513))
out_numpy1 = out1.cpu().numpy()
out_numpy2 = out2.cpu().numpy()
print("Number of output ex and features for test signal 1:", out_numpy1.shape)
print("Number of output ex and features for test signal 2:", out_numpy2.shape)
# Reconstruct each test signal: combine the noisy STFT's phase with the
# predicted clean magnitudes and invert the STFT. Shared helper replaces the
# duplicated per-signal code.
def _reconstruct(noisy_spec, noisy_mag, predicted_mag):
    """Invert predicted magnitudes (frames, 513) to a time-domain signal.

    Guards zero-magnitude bins: dividing by |X| = 0 would otherwise produce
    NaN/Inf phase factors.
    """
    phase = np.divide(noisy_spec, np.maximum(noisy_mag, np.finfo(noisy_mag.dtype).tiny))
    return librosa.istft(np.multiply(phase, predicted_mag.T), hop_length=512)

s_pred1 = _reconstruct(X_test1, X_test_abs1, out_numpy1)
print("Length of clean signal:", s_pred1.shape)
Audio(s_pred1, rate=sr_test1)

s_pred2 = _reconstruct(X_test2, X_test_abs2, out_numpy2)
print("Length of clean signal:", s_pred2.shape)
Audio(s_pred2, rate=sr_test2)